In [1]:
%load_ext vimception
In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
%reload_ext autoreload
In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
Fei Liu and others collected string pairs from social media and this excellent source is available at:
http://www.hlt.utdallas.edu/~yangl/data/Text_Norm_Data_Release_Fei_Liu/
In [4]:
lines = open('examples/Test_Set_3802_Pairs.txt', 'r').readlines()
ppairs = [line.split('\t')[1].strip().split(' | ') for line in lines]
ppairs = [(pair[0], pair[i]) for pair in ppairs for i in xrange(1, len(pair))]
print len(ppairs)
ppairs[:5]
Out[4]:
Let's keep 1000 of these pairs out to evaluate the final performance of the model.
In [5]:
from sklearn.cross_validation import train_test_split
ppairs_train, ppairs_test = train_test_split(ppairs, test_size=1000, random_state=1)
ppairs_train = [tuple(ppair_train) for ppair_train in ppairs_train]
ppairs_test = [tuple(ppair_test) for ppair_test in ppairs_test]
print len(ppairs_train), len(ppairs_test)
In [6]:
from numpy.random import shuffle
incorrect = list(zip(*ppairs_train)[0])
shuffle(incorrect)
correct = list(zip(*ppairs_train)[1])
npairs_train = zip(incorrect, correct)
npairs_train[:5]
Out[6]:
Concatenate the positive and negative examples and create labels - 0
for matching pairs and 1
for non-matching pairs.
In [7]:
x_raw = ppairs_train + npairs_train
y_orig = [0] * len(ppairs_train) + [1] * len(npairs_train)
In [9]:
from pyhacrf import StringPairFeatureExtractor
In [10]:
fe = StringPairFeatureExtractor(match=True, numeric=True, transition=True)
x_orig = fe.fit_transform(x_raw)
In [44]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
x_train, x_test, y_train, y_test = train_test_split(x_orig, y_orig, test_size=0.2, random_state=42)
print y_train[:10], y_test[:10]
print len(y_train), len(x_train), len(y_test), len(x_test)
In [12]:
from pyhacrf import Hacrf
from scipy.optimize import fmin_l_bfgs_b
In [69]:
models = []
accs_train = []
accs_val = []
regs = []
In [70]:
for n, i in enumerate(np.linspace(-1, 1, 20)):
for repeat in xrange(5):
print n, r, 10.0**(i)
m = Hacrf(l2_regularization=10.0**(i), optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 100})
x_t, x_v, y_t, y_v = train_test_split(x_train, y_train, test_size=0.5, random_state=42 + n + repeat * 1000)
m.fit(x_t, y_t, verbosity=20)
train_score = accuracy_score(m.predict(x_t), y_t)
val_score = accuracy_score(m.predict(x_v), y_v)
print 10.0**(i), train_score, val_score
regs.append(10.0**(i))
models.append(m)
accs_train.append(train_score)
accs_val.append(val_score)
In [82]:
plt.xscale('log')
plt.scatter([10.0**np.log(i) for i in regs], [acc for acc in accs_val], marker='x', label='Validation set')
plt.scatter([10.0**np.log(i) for i in regs], [acc for acc in accs_train], c='r', marker='x', label='Training set')
plt.legend()
plt.title('HACRF with transition features')
plt.xlabel('Regularisation')
plt.ylabel('Accuracy')
plt.xlim(0.08, 11)
Out[82]:
In [86]:
import cPickle # Lets store the results in case we need them later
cPickle.dump((models, accs_train, accs_val, regs), open('models/val_for_reg_hacrf_t.pkl', 'wb'))
I initially thought that 1 is a good enough value to continue with, but eventually went with 10 because the transition weight pictures looked less noisy and more interesting.
In [102]:
m = Hacrf(l2_regularization=10.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 45}, state_machine=None)
m.fit(x_train, y_train, verbosity=20)
Out[102]:
In [103]:
from sklearn.metrics import accuracy_score
In [104]:
from sklearn.metrics import confusion_matrix
pr = m.predict(x_train)
print confusion_matrix(y_train, pr)
print '{:.2}% error'.format((1 - accuracy_score(y_train, pr)) * 100)
In [105]:
pr = m.predict(x_test)
print confusion_matrix(y_test, pr)
print '{:.2}% error'.format((1 - accuracy_score(y_test, pr)) * 100)
The error on the test set is higher than that on the training set. This means that the model is probably still overfitting.
Let's try to visualise the learned transition matrix. There are many of these matrices, one per edit transition and per
class. So for this model, there are 3 x 2 = 6
. Let's just look at the match
ing class' substitute
edit transition.
In [106]:
plt.figure(figsize=(8, 8))
plt.imshow(m.parameters[0, 3:].reshape(63, 63)[:27, :27], interpolation='nearest', vmin=-0.8, vmax=0.8, cmap='seismic')
plt.xticks(range(27), fe.CHARACTERS[:27])
plt.yticks(range(27), fe.CHARACTERS[:27])
plt.title('Weights learned for the matching states')
plt.colorbar()
print m.parameters[0, :3]
In [107]:
plt.figure(figsize=(8, 8))
plt.imshow(m.parameters[1, 3:].reshape(63, 63)[:27, :27], interpolation='nearest', vmin=-0.8, vmax=0.8, cmap='seismic')
plt.xticks(range(27), fe.CHARACTERS[:27])
plt.yticks(range(27), fe.CHARACTERS[:27])
plt.title('Weights learned for the non-matching states')
plt.colorbar()
Out[107]:
In [182]:
fe_base = StringPairFeatureExtractor(match=True, numeric=True)
x_orig_base = fe_base.fit_transform(x_raw)
x_train_base, x_test_base, y_train_base, y_test_base = train_test_split(x_orig_base, y_orig, test_size=0.2, random_state=42)
models_base = []
accs_train_base = []
accs_val_base = []
regs_base = []
for n, i in enumerate(np.linspace(-10, 4, 25)):
print np.exp(i)
m_base = Hacrf(l2_regularization=np.exp(i), optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 45})
x_t, x_v, y_t, y_v = train_test_split(x_train_base, y_train, test_size=0.5, random_state=42 + n)
m_base.fit(x_t, y_t, verbosity=5)
train_score = accuracy_score(m_base.predict(x_t), y_t)
val_score = accuracy_score(m_base.predict(x_v), y_v)
print np.exp(i), train_score, val_score
regs_base.append(np.exp(i))
models_base.append(m_base)
accs_train_base.append(train_score)
accs_val_base.append(val_score)
In [197]:
plt.xscale('log')
plt.scatter([(i) for i in regs_base], [acc for acc in accs_val_base], marker='x', label='Validation set')
plt.scatter([(i) for i in regs_base], [acc for acc in accs_train_base], c='r', marker='x', label='Training set')
plt.legend()
plt.xlabel('Regularisation parameter')
plt.ylabel('Accuracy')
Out[197]:
In [185]:
m_base = Hacrf(l2_regularization=0.1, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 25}, state_machine=None)
m_base.fit(x_train_base, y_train, verbosity=5)
pr = m_base.predict(x_train_base)
print 'Training score:'
print confusion_matrix(y_train, pr)
print '{:.2}% error'.format((1 - accuracy_score(y_train, pr)) * 100)
print 'Testing score:'
pr = m_base.predict(x_test_base)
print confusion_matrix(y_test, pr)
print '{:.2}% error'.format((1 - accuracy_score(y_test, pr)) * 100)
Now, let's take the 1000 pairs we held out at the start, and try to recover the correct token from a list of dictionary words given the incorrect one.
To do this, we'll construct a 1000 new sets of pairs. For each incorrect token, we'll construct a list of pairs where the first element in the pair is the incorrect token, and the second element is a candidate correct token from a dictionary.
In [33]:
dictionary = [line.split()[1].strip() for line in open('uk_word_freq.txt', 'r').readlines()[1:]]
print len(dictionary)
dictionary[:10]
Out[33]:
In [34]:
for incorrect, correct in ppairs_test[:10]:
test_pairs = [(incorrect, candidate) for candidate in set(dictionary[:10000] + [correct])]
gx_test = fe.transform(test_pairs)
pr = m.predict_proba(gx_test)
cr = zip(pr, test_pairs)
cr = sorted(cr, key=lambda x: -x[0][0])
print (incorrect, correct),
print [(candidate[1][1], '{:.3f}'.format(candidate[0][0])) for candidate in cr[:10]]
print
In [36]:
import editdistance
In [37]:
dictionary_length = len(dictionary)
dictionary_rank = dict([(word, dictionary_length - i) for i, word in enumerate(dictionary[::-1])])
dictionary_rank['a']
Out[37]:
In [161]:
result_file = open('levenshtein_generation_result.txt', 'a')
for i, (incorrect, correct) in enumerate(ppairs_test[:1000]):
print i,
test_pairs = [(incorrect, candidate) for candidate in set(dictionary[:20000] + [correct])]
pr = [editdistance.eval(*test_pair) for test_pair in test_pairs]
cr = zip(pr, test_pairs)
cr = sorted(cr, key=lambda x: x[0])
result_file.write('{} {} {}\n'.format(incorrect, correct,
[(candidate[1][1], '{:.5f}'.format(candidate[0])) for candidate in cr[:1000]]))
result_file.flush()
In [161]:
result_file = open('levenshtein_generation_result_using_rank.txt', 'a')
for i, (incorrect, correct) in enumerate(ppairs_test[:1000]):
print i,
test_pairs = [(incorrect, candidate) for candidate in set(dictionary[:20000] + [correct])]
pr = [editdistance.eval(*test_pair) for test_pair in test_pairs]
cr = zip(pr, test_pairs)
cr = sorted(cr, key=lambda x: (x[0], dictionary_rank.get(x[1][1], dictionary_length)))
result_file.write('{} {} {}\n'.format(incorrect, correct,
[(candidate[1][1], '{:.5f}'.format(candidate[0])) for candidate in cr[:1000]]))
result_file.flush()
In [108]:
import cPickle
In [114]:
cPickle.dump(m, open('models/m_mnt.pkl', 'wb'))
cPickle.dump(ppairs_test, open('models/ppairs_test.pkl', 'wb'))
cPickle.dump(dictionary, open('models/dictionary.pkl', 'wb'))
In [79]:
result_file = open('generation_result.txt', 'a')
for i, (incorrect, correct) in enumerate(ppairs_test[:1000]):
print i,
test_pairs = [(incorrect, candidate) for candidate in set(dictionary[:20000] + [correct])]
gx_test = fe.transform(test_pairs)
pr = m.predict_proba(gx_test)
cr = zip(pr, test_pairs)
cr = sorted(cr, key=lambda x: -x[0][0])
result_file.write('{} {} {}\n'.format(incorrect, correct,
[(candidate[1][1], '{:.5f}'.format(candidate[0][0])) for candidate in cr[:1000]]))
result_file.flush()
In [186]:
result_file = open('generation_result_with_pre_rank_no_transition.txt', 'a')
for i, (incorrect, correct) in enumerate(ppairs_test[:1000]):
print i,
test_pairs = [(incorrect, candidate) for candidate in set(dictionary[:20000] + [correct])]
pr = [editdistance.eval(*test_pair) for test_pair in test_pairs]
cr = zip(pr, test_pairs)
cr = sorted(cr, key=lambda x: (x[0], dictionary_rank.get(x[1][1], dictionary_length)))
new_candidates = [candidate[1][1] for candidate in cr[:1000]]
test_pairs = [(incorrect, candidate) for candidate in new_candidates]
gx_test = fe_base.transform(test_pairs)
pr = m_base.predict_proba(gx_test)
cr = zip(pr, test_pairs)
cr = sorted(cr, key=lambda x: -x[0][0])
result_file.write('{} {} {}\n'.format(incorrect, correct,
[(candidate[1][1], '{:.5f}'.format(candidate[0][0])) for candidate in cr[:1000]]))
result_file.flush()
In [38]:
result_file = open('generation_result_with_pre_rank.txt', 'a')
for i, (incorrect, correct) in enumerate(ppairs_test[:1000]):
print i,
test_pairs = [(incorrect, candidate) for candidate in set(dictionary[:20000] + [correct])]
pr = [editdistance.eval(*test_pair) for test_pair in test_pairs]
cr = zip(pr, test_pairs)
cr = sorted(cr, key=lambda x: (x[0], dictionary_rank.get(x[1][1], dictionary_length)))
new_candidates = [candidate[1][1] for candidate in cr[:1000]]
test_pairs = [(incorrect, candidate) for candidate in new_candidates]
gx_test = fe.transform(test_pairs)
pr = m.predict_proba(gx_test)
cr = zip(pr, test_pairs)
cr = sorted(cr, key=lambda x: -x[0][0])
result_file.write('{} {} {}\n'.format(incorrect, correct,
[(candidate[1][1], '{:.5f}'.format(candidate[0][0])) for candidate in cr[:1000]]))
result_file.flush()
In [89]:
cutoffs = [1, 3, 20, 100, 1000]
print '{:30} {}'.format('Method', ' '.join(['{:4d}'.format(cutoff) for cutoff in cutoffs]))
for method, result_file in [('Levenshtein', 'levenshtein_generation_result.txt'),
('Levenshtein rank', 'levenshtein_generation_result_using_rank.txt'),
('Hacrf no transition', 'generation_result.txt'),
('Hacrf no transition (pre-rank)', 'generation_result_with_pre_rank_no_transition.txt'),
('Hacrf transition (pre-rank)', 'generation_result_with_pre_rank.txt')]:
results = [line.split() for line in open(result_file, 'r').readlines()[:588]]
results = [(line[1], [candidate[0] for candidate in eval(''.join(line[2:]))]) for line in results]
result_indices = [line[1].index(line[0]) if line[0] in line[1] else 1001 for line in results]
print '{:30} {}'.format(method, ' '.join(['{:.2f}'.format(sum([1.0 if res < cutoff else 0.0 for res in result_indices]) / len(result_indices))
for cutoff in cutoffs]))